# ==============================================================================
# file name: 4-describe-data.R
# date:	Nov 9, 2023
# author: Bernhard Clemm
# purpose: reproduce all descriptives of data reported in paper before results
# ==============================================================================

# Remove every (non-hidden) object from the global environment before running.
# This clears objects only; packages that are already attached stay attached.
# NOTE(review): this file contains no library() calls although it uses %>%,
# filter(), get_summary_stats(), pivot_wider(), kable(), read.xlsx(), ... —
# presumably the packages are attached by an earlier script; confirm.
rm(list = ls())

# SAMPLE SIZES AND VISITS ======================================================

## Section 3 ####

# "Our final Facebook sample has nsubjects = 707 and nwebvisits = 16.4 million in the first
# wave (up to 90 days), the Lucid sample nsubjects = 2,222 and nwebvisits = 73.8 million in the
# first wave (up to 90 days), and the YouGov sample nsubjects = 957 and nwebvisits = 6.4 million
# (up to 60 days)."

# Print the number of subjects (rows) and the total number of web visits
# (sum of `n_total`, in millions with one decimal) of one analysis data set.
print_sample_size <- function(dat) {
  visits_million <- round(sum(dat$n_total) / 1e6, 1)
  print(paste("n subjects =", nrow(dat)))
  print(paste("n web visits =", visits_million, "million"))
  invisible(NULL)
}

# Facebook: first wave, at least seven active days
fb <- read.csv("data/analysis_FB.csv") %>%
  filter(n_days_active >= 7, wave == 1)
print_sample_size(fb)

# Lucid: first wave, at least seven active days
lu <- read.csv("data/analysis_LU.csv") %>%
  filter(n_days_active >= 7, wave == 1)
print_sample_size(lu)

# YouGov: at least seven active days (no wave restriction is applied)
yg <- read.csv("data/analysis_YG.csv") %>%
  filter(n_days_active >= 7)
print_sample_size(yg)

# DONORS VS. NON-DONORS ========================================================

## Section 3.1 ####

# "On Facebook, out of 2,775 people who participated in our study, there were 820 (29.5%) donors
# (of which 707 had more than seven days of browsing data and were thus included in the
# main analysis) and 1,955 non-donors. On Lucid, out of 15,589 initial participants, there were
# 2,462 (15.8%) donors (of which 2,222 had more than seven days of browsing) and 13,127 non-
# donors.... On Yougov, we then have 1,179 donors (of which 957 had more than
# seven days of browsing) and 4,543 non-donors."

# Print donor / non-donor head counts for one survey data set.
#
# Arguments:
#   survey      Data frame with a `donor` column coded 1 (donor) / 0 (non-donor).
#   with_total  If TRUE, also print the number of rows and the donor share in
#               percent of all rows (one decimal).
#
# Counts use sum(..., na.rm = TRUE) instead of nrow(survey[cond, ]): logical
# row subsetting turns an NA condition into an extra all-NA row, so rows with
# a missing `donor` value would be counted as donors AND as non-donors.
print_donor_counts <- function(survey, with_total = TRUE) {
  n_all <- nrow(survey)
  n_donors <- sum(survey[["donor"]] == 1, na.rm = TRUE)
  n_nondonors <- sum(survey[["donor"]] == 0, na.rm = TRUE)

  if (with_total) {
    print(paste("n all =", n_all))
  }
  print(paste("n donors =", n_donors))
  if (with_total) {
    print(paste("% donors =", round(n_donors * 100 / n_all, 1)))
  }
  print(paste("n non-donors =", n_nondonors))
  invisible(NULL)
}

survey_fb_all <- read.csv("data/surveys_processed/survey_FB_all.csv")
print_donor_counts(survey_fb_all)

survey_lu_all <- read.csv("data/surveys_processed/survey_LU_all.csv")
print_donor_counts(survey_lu_all)

# For YouGov only the donor and non-donor counts are reported
survey_yg_all <- read.csv("data/surveys_processed/survey_YG_all.csv")
print_donor_counts(survey_yg_all, with_total = FALSE)

## SM B.1.2 Comparing donors and non-donors: Table B.1 ####

# Variables compared between donors and non-donors. make_table() keeps
# whichever of these columns a sample contains (via any_of()) and uses this
# vector as the factor levels it sorts by, so the order given here is the row
# order of each sample's summary.
outcomes <- c(
  "age", "female", "edu_high", "white",
  "party", "ideo", "ft_outparty",
  "int_politics", "foll_politics", "polknow"
)

## Functions ####

# Round `value` to `dgts` decimals and return it as text showing at least
# `dgts` decimal places, so trailing zeros are kept (e.g. 2 -> "2.0").
# For a vector, format() pads all elements to a common width.
roundr <- function(value, dgts = 3) {
  rounded <- round(value, digits = dgts)
  format(rounded, nsmall = dgts)
}

# Build the donor vs. non-donor summary of one sample.
#
# Arguments:
#   dt          Survey data frame containing the donor indicator and any
#               subset of the columns named in the global `outcomes`.
#   donors_var  Unquoted name of the donor indicator column; the value 1 is
#               labelled "donor", everything else "nondonor".
#
# Returns a data frame with one row per outcome found in `dt`, ordered as in
# `outcomes`, with the formatted cells `donor` and `nondonor` ("mean (se)")
# alongside the underlying mean_* / se_* text columns.
make_table <- function(dt, donors_var) {
  share_vars <- c("female", "edu_high", "white")
  one_digit_vars <- c(share_vars, "party", "ft_outparty")

  # Proportions are reported as percentages
  as_percent <- function(x, variable) {
    ifelse(variable %in% share_vars, x * 100, x)
  }

  # Decimals depend on the variable type: 0 for age (a median), 1 for
  # percentages, party and the thermometer, 2 for the 0-1 variables.
  # Padding added by format() is trimmed off.
  format_stat <- function(x, variable) {
    trimws(case_when(
      variable %in% c("age") ~ roundr(x, 0),
      variable %in% one_digit_vars ~ roundr(x, 1),
      .default = roundr(x, 2)
    ))
  }

  group_stats <- dt %>%
    select({{ donors_var }}, any_of(outcomes)) %>%
    group_by({{ donors_var }}) %>%
    get_summary_stats(show = c("mean", "median", "se"))

  formatted <- group_stats %>%
    # Age is summarised by its median instead of its mean
    mutate(mean = ifelse(variable == "age", median, mean)) %>%
    select(-median, -n) %>%
    mutate(
      mean = as_percent(mean, variable),
      se = as_percent(se, variable)
    ) %>%
    mutate(
      mean = format_stat(mean, variable),
      se = format_stat(se, variable)
    )

  # One row per variable, with donor and non-donor statistics side by side
  wide <- formatted %>%
    mutate(donors = ifelse({{ donors_var }} == 1, "donor", "nondonor")) %>%
    select(-{{ donors_var }}) %>%
    pivot_wider(names_from = "donors", values_from = c("mean", "se"))

  wide %>%
    mutate(
      donor = paste0(mean_donor, " (", se_donor, ")"),
      nondonor = paste0(mean_nondonor, " (", se_nondonor, ")"),
      # Ordered factor so that arrange() follows the order of `outcomes`
      variable = factor(variable, ordered = TRUE, levels = outcomes)
    ) %>%
    arrange(variable) %>%
    # Back to plain text, dropping wave ("_w<digit>") and "_num" suffixes
    mutate(variable = gsub("_w[0-9]|_num", "", variable))
}

# Two-sample t-test (t.test() defaults) of an outcome across the two groups
# of an indicator column.
#
# Arguments:
#   var_outcome  Name (string) of the outcome column in `dat`.
#   dat          Data frame holding both columns.
#   var_prof     Name (string) of the grouping column in `dat`.
#
# Returns the p-value of the test.
ttest_p <- function(var_outcome, dat, var_prof) {
  # `[[` yields a plain vector for data frames and tibbles alike, whereas
  # `dat[, col]` stays a one-column table when `dat` is a tibble.
  outcome <- dat[[var_outcome]]
  group <- dat[[var_prof]]
  t.test(outcome ~ group)$p.value
}
# Chi-squared test (chisq.test() defaults) on the cross-tabulation of an
# outcome column and a grouping column.
#
# Arguments:
#   var_outcome  Name (string) of the categorical outcome column in `dat`.
#   dat          Data frame holding both columns.
#   var_prof     Name (string) of the grouping column in `dat`.
#
# Returns the p-value of the test.
chisq_p <- function(var_outcome, dat, var_prof) {
  # `[[` yields a plain vector for data frames and tibbles alike, whereas
  # `dat[, col]` stays a one-column table when `dat` is a tibble.
  crosstab <- table(dat[[var_outcome]], dat[[var_prof]])
  chisq.test(crosstab)$p.value
}
# Two-sample Kolmogorov-Smirnov test comparing the distribution of an outcome
# between rows where the indicator equals 1 and rows where it equals 0.
#
# Arguments:
#   var_outcome  Name (string) of the outcome column in `dat`.
#   dat          Data frame holding both columns.
#   var_prof     Name (string) of the indicator column in `dat` (1 / 0).
#
# Returns the p-value of the test.
ks_p <- function(var_outcome, dat, var_prof) {
  # `[[` yields a plain vector for data frames and tibbles alike, whereas
  # `dat[, col]` stays a one-column table when `dat` is a tibble.
  outcome <- dat[[var_outcome]]
  indicator <- dat[[var_prof]]
  # which() skips rows whose indicator is missing
  in_group <- outcome[which(indicator == 1)]
  out_group <- outcome[which(indicator == 0)]
  ks.test(in_group, out_group)$p.value
}

## Facebook ####

# Facebook columns of Table B.1: group summaries, p-values, significance marks
summary_fb_donors <- make_table(survey_fb_all, donor)

# One chi-squared test per variable. vapply() enforces a single numeric
# p-value per test and names the result after the tested variables.
# NOTE(review): age is chi-squared-tested here, whereas the other samples use
# a KS test for age — presumably because Facebook age is recorded in brackets
# (see the recoding at the end of this block); confirm.
pvalues_fb_donors <- vapply(
  c("female", "edu_high", "white", "age"), chisq_p,
  FUN.VALUE = numeric(1),
  dat = survey_fb_all, var_prof = "donor"
)

pvalues_fb_donors <- data.frame(
  p = pvalues_fb_donors, variable = names(pvalues_fb_donors)
)

summary_fb_donors <- summary_fb_donors %>%
  # Explicit key: `variable` is the only column the two tables share
  left_join(., pvalues_fb_donors, by = "variable") %>%
  # LaTeX significance markers; p >= 0.10 or a missing p leaves `sig` NA
  mutate(sig = case_when(
    p < 0.001 ~ "\\star\\star\\star",
    p < 0.01 ~ "\\star\\star",
    p < 0.05 ~ "\\star",
    p < 0.10 ~ "\\circ"
  )) %>%
  ###########################
  # for exporting HTML only
  # mutate(sig = case_when(
  #   p < 0.001 ~ "***",
  #   p < 0.01 ~ "**",
  #   p < 0.05 ~ "*",
  #   p < 0.10 ~ "○"
  # )) %>%
  ###########################
  # Age is a median, so its standard error in parentheses is dropped
  mutate(
    donor = ifelse(variable == "age", gsub(" \\(.+\\)", "", donor), donor),
    nondonor = ifelse(variable == "age", gsub(" \\(.+\\)", "", nondonor), nondonor)
  ) %>%
  # Replace the bracket codes 4 / 5 / 6 with their age ranges; cells with
  # any other content are kept as they are
  mutate(across(c(donor, nondonor), ~ case_when(
    . == "4" ~ "30-34",
    . == "5" ~ "35-39",
    . == "6" ~ "40-44",
    .default = .
  )))

## Lucid ####

# Give the Lucid columns the names listed in `outcomes`: `party_w0` becomes
# `party`, and `ideo` is `ideo_w0` divided by 10 (presumably a 0-10 scale
# mapped to 0-1 — confirm). `donor` is converted to a factor.
survey_lu_all <- survey_lu_all %>%
  rename("party" = party_w0) %>%
  mutate(
    ideo = (ideo_w0 - 0) / (10 - 0),
    donor = as.factor(donor)
  )

summary_lucid_donors <- make_table(survey_lu_all, donor)

# KS test for age, chi-squared tests for the binary sociodemographics and
# t-tests for party and ideology. vapply() enforces a single numeric p-value
# per test and names each result after the tested variable.
pvalues_lucid_donors <- c(
  vapply(
    c("age"), ks_p,
    FUN.VALUE = numeric(1),
    dat = survey_lu_all, var_prof = "donor"
  ),
  vapply(
    c("female", "edu_high", "white"), chisq_p,
    FUN.VALUE = numeric(1),
    dat = survey_lu_all, var_prof = "donor"
  ),
  vapply(
    c("party", "ideo"), ttest_p,
    FUN.VALUE = numeric(1),
    dat = survey_lu_all, var_prof = "donor"
  )
)

pvalues_lucid_donors <- data.frame(
  p = pvalues_lucid_donors, variable = names(pvalues_lucid_donors)
)

summary_lucid_donors <- summary_lucid_donors %>%
  # Explicit key: `variable` is the only column the two tables share
  left_join(., pvalues_lucid_donors, by = "variable") %>%
  # LaTeX significance markers; p >= 0.10 or a missing p leaves `sig` NA
  mutate(sig = case_when(
    p < 0.001 ~ "\\star\\star\\star",
    p < 0.01 ~ "\\star\\star",
    p < 0.05 ~ "\\star",
    p < 0.10 ~ "\\circ"
  )) %>%
  ###########################
  # for exporting HTML only
  # mutate(sig = case_when(
  #   p < 0.001 ~ "***",
  #   p < 0.01 ~ "**",
  #   p < 0.05 ~ "*",
  #   p < 0.10 ~ "○"
  # )) %>%
  ###########################
  # Age is a median, so its standard error in parentheses is dropped
  mutate(
    donor = ifelse(variable == "age", gsub(" \\(.+\\)", "", donor), donor),
    nondonor = ifelse(variable == "age", gsub(" \\(.+\\)", "", nondonor), nondonor)
  )

## YouGov ####

# YouGov columns of Table B.1: group summaries, p-values, significance marks
summary_yg_donors <- make_table(survey_yg_all, donor)

# KS test for age, chi-squared tests for the binary sociodemographics and
# t-tests for the political variables. vapply() enforces a single numeric
# p-value per test and names each result after the tested variable.
pvalues_yg_donors <- c(
  vapply(
    c("age"), ks_p,
    FUN.VALUE = numeric(1),
    dat = survey_yg_all, var_prof = "donor"
  ),
  vapply(
    c("female", "edu_high", "white"), chisq_p,
    FUN.VALUE = numeric(1),
    dat = survey_yg_all, var_prof = "donor"
  ),
  vapply(
    c(
      "party", "ideo", "ft_outparty", "int_politics",
      "foll_politics", "polknow"
    ), ttest_p,
    FUN.VALUE = numeric(1),
    dat = survey_yg_all, var_prof = "donor"
  )
)

pvalues_yg_donors <- data.frame(
  p = pvalues_yg_donors, variable = names(pvalues_yg_donors)
)

summary_yg_donors <- summary_yg_donors %>%
  # Explicit key: `variable` is the only column the two tables share
  left_join(., pvalues_yg_donors, by = "variable") %>%
  # LaTeX significance markers; p >= 0.10 or a missing p leaves `sig` NA
  mutate(sig = case_when(
    p < 0.001 ~ "\\star\\star\\star",
    p < 0.01 ~ "\\star\\star",
    p < 0.05 ~ "\\star",
    p < 0.10 ~ "\\circ"
  )) %>%
  ###########################
  # for exporting HTML only
  # mutate(sig = case_when(
  #   p < 0.001 ~ "***",
  #   p < 0.01 ~ "**",
  #   p < 0.05 ~ "*",
  #   p < 0.10 ~ "○"
  # )) %>%
  ###########################
  # Age is a median, so its standard error in parentheses is dropped
  mutate(
    donor = ifelse(variable == "age", gsub(" \\(.+\\)", "", donor), donor),
    nondonor = ifelse(variable == "age", gsub(" \\(.+\\)", "", nondonor), nondonor)
  )

## TABLE ####

# The per-sample summaries may carry LaTeX significance markers ("\star",
# "\circ"). The table below is written as HTML, where those commands would be
# printed literally, so they are translated to plain characters first.
# Markers that are already plain ("*", "○") pass through unchanged.
sig_to_html <- function(sig) {
  sig <- gsub("\\star", "*", sig, fixed = TRUE)
  gsub("\\circ", "\u25cb", sig, fixed = TRUE)
}

# Combine the three samples into one table with one row per variable.
# Lucid comes first, so its rows lead; variables found only in the other
# samples are appended by the full joins.
summary_all_donors <- summary_lucid_donors %>%
  select(
    variable,
    lucid_data_donors = donor,
    lucid_non_data_donors = nondonor,
    lucid_sig = sig
  ) %>%
  full_join(
    .,
    summary_yg_donors %>%
      select(
        variable,
        yg_data_donors = donor,
        yg_non_data_donors = nondonor,
        yg_sig = sig
      ),
    by = "variable"
  ) %>%
  full_join(
    .,
    summary_fb_donors %>%
      select(
        variable,
        fb_data_donors = donor,
        fb_non_data_donors = nondonor,
        fb_sig = sig
      ),
    by = "variable"
  ) %>%
  mutate(across(ends_with("_sig"), sig_to_html)) %>%
  # Row labels (plain "%" because the output is HTML, not LaTeX)
  mutate(varname = case_when(
    variable == "age" ~ "Age (median years) ",
    variable == "female" ~ "Gender (% female)",
    variable == "edu_high" ~ "Education (% Bachelor or more)",
    variable == "white" ~ "Ethnicity (% white)",
    variable == "ideo" ~ "Ideology (0-1)",
    variable == "party" ~ "Partisanship (1-7)",
    variable == "ft_outparty" ~ "Thermometer out-party (1-100)",
    variable == "int_politics" ~ "Political interest (0-1)",
    variable == "polknow" ~ "Political knowledge (0-1)",
    variable == "foll_politics" ~ "Following politics (0-1)"
  )) %>%
  # Empty cells for variables or markers a sample does not have
  mutate(across(everything(), ~ ifelse(is.na(.), "", .))) %>%
  select(
    varname,
    fb_data_donors, fb_sig, fb_non_data_donors,
    lucid_data_donors, lucid_sig, lucid_non_data_donors,
    yg_data_donors, yg_sig, yg_non_data_donors
  )

# NOTE(review): the last two footnote sentences (Census / ANES 2020, recoding
# to 0-1) look carried over from another table — confirm they belong here.
kable(
  summary_all_donors,
  caption = "Data donors vs. non-donors",
  format = "html", booktabs = TRUE, escape = FALSE, linesep = "",
  row.names = FALSE, label = "sociodem-b",
  col.names = c(
    "", "Donors", "", "Non-donors",
    "Donors", "", "Non-donors",
    "Donors", "", "Non-donors"
  )
) %>%
  kable_styling(
    full_width = FALSE,
    latex_options = c("HOLD_position", "scale_down")
  ) %>%
  add_header_above(c(" " = 1, "Facebook" = 3, "Lucid" = 3, "Yougov" = 3)) %>%
  # Row ranges assume the ten rows of `outcomes`, in that order
  pack_rows("Sociodemographics", 1, 4) %>%
  pack_rows("Political outcomes", 5, 10) %>%
  column_spec(1, width = "7cm") %>%
  footnote(
    general = paste0(
      "Standard errors for means in parentheses. Significance of ",
      "differences between donors and non-donors was tested with a ",
      "Kolmogorov-Smirnov test for age, chi-squared tests for gender, ",
      "education and race, and t-tests for all other variables ",
      "(\u25cb p < 0.1; * p < 0.05; ** p < 0.01; *** p < 0.001). ",
      "Sociodemographic population data from the US Census; political ",
      "variables from ANES 2020. Variables trust, political interest, ",
      "knowledge and partisanship were recoded to a scale from 0 to 1 to ",
      "ensure comparability."
    ),
    threeparttable = TRUE
  ) %>%
  save_kable(., file = "output/tabB1_donors_vs_nondonors.html")

# SURVEY SITE MEASUREMENT ======================================================

## Section 3.2 ####

# "Inter-coder reliability for the overlaps was high (Facebook: 92% agreement;
# Lucid: 88%; YouGov: 94%)."

# Read one double-coded host file and flag, per host, whether both coders
# assigned the same code (agreement = 1) or not (0). Hosts with a missing
# code from either coder are dropped.
# NOTE(review): read.csv() reads blank cells of character columns as "" and
# not as NA — confirm that uncoded hosts are stored as NA in these files.
read_coded_hosts <- function(path) {
  read.csv(path) %>%
    select(url_host, code_1, code_2) %>%
    filter(!is.na(code_1), !is.na(code_2)) %>%
    mutate(agreement = ifelse(code_1 == code_2, 1, 0))
}

hosts_500_FB_coded <- read_coded_hosts(
  "data/browsing_hosts/hosts_500_FB_coded.csv"
)
hosts_500_LUCID_coded <- read_coded_hosts(
  "data/browsing_hosts/hosts_500_LU_coded.csv"
)
hosts_500_YG_coded <- read_coded_hosts(
  "data/browsing_hosts/hosts_500_YG_coded.csv"
)

# Share of hosts with disagreement (0) and agreement (1). Printed explicitly
# because top-level values are not auto-printed when the file is run through
# source().
print(prop.table(table(hosts_500_FB_coded$agreement)))
print(prop.table(table(hosts_500_LUCID_coded$agreement)))
print(prop.table(table(hosts_500_YG_coded$agreement)))

# B.3 SELF-REPORTED MEASURES ===================================================

# Table B.3: wording of the political survey variables, read from the first
# sheet of the variable overview workbook.
vars <- read.xlsx("data/surveys_raw/survey_variables.xlsx", sheet = 1)

# The internal variable name is not shown in the table
vars_tab <- vars %>% select(-variable)
# NOTE(review): linebreak() appears to be kableExtra's LaTeX helper — confirm
# it produces the intended markup now that the table is exported as HTML.
vars_tab$question <- linebreak(vars_tab$question)

kable(
  vars_tab,
  caption = "Wording of political survey variables",
  format = "html", booktabs = TRUE, escape = FALSE, linesep = "",
  longtable = TRUE,
  row.names = FALSE, label = "var-description",
  col.names = c(
    "Dataset", "Question wording", "Response scale", "Recoded scale"
  )
) %>%
  kable_styling(font_size = 7, latex_options = c("repeat_header")) %>%
  # Row ranges are tied to the row order of the workbook
  pack_rows("Partisanship", 1, 4) %>%
  pack_rows("Ideology", 5, 8) %>%
  pack_rows("Out-party feeling", 9, 12) %>%
  pack_rows("Political interest", 13, 16) %>%
  pack_rows("Political knowledge", 17, 19) %>%
  pack_rows("Following politics in the media", 20, 21) %>%
  column_spec(2, width = "12cm") %>%
  save_kable(., file = "output/tabB3_survey_variables.html")
